{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "758b8b71",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\env\\Anaconda3\\lib\\site-packages\\numpy\\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:\n",
      "D:\\env\\Anaconda3\\lib\\site-packages\\numpy\\.libs\\libopenblas.QVLO2T66WEPI7JZ63PS3HMOHFEY472BC.gfortran-win_amd64.dll\n",
      "D:\\env\\Anaconda3\\lib\\site-packages\\numpy\\.libs\\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll\n",
      "  warnings.warn(\"loaded more than 1 DLL from .libs:\"\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>android_id</th>\n",
       "      <th>apptype</th>\n",
       "      <th>carrier</th>\n",
       "      <th>dev_height</th>\n",
       "      <th>dev_ppi</th>\n",
       "      <th>dev_width</th>\n",
       "      <th>label</th>\n",
       "      <th>lan</th>\n",
       "      <th>media_id</th>\n",
       "      <th>...</th>\n",
       "      <th>os</th>\n",
       "      <th>osv</th>\n",
       "      <th>package</th>\n",
       "      <th>sid</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>version</th>\n",
       "      <th>fea_hash</th>\n",
       "      <th>location</th>\n",
       "      <th>fea1_hash</th>\n",
       "      <th>cus_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>316361</td>\n",
       "      <td>1199</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>104</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>9</td>\n",
       "      <td>18</td>\n",
       "      <td>1438873</td>\n",
       "      <td>1.559893e+12</td>\n",
       "      <td>8</td>\n",
       "      <td>2135019403</td>\n",
       "      <td>0</td>\n",
       "      <td>2329670524</td>\n",
       "      <td>601</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>135939</td>\n",
       "      <td>893</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1</td>\n",
       "      <td>0</td>\n",
       "      <td>1185582</td>\n",
       "      <td>1.559994e+12</td>\n",
       "      <td>4</td>\n",
       "      <td>2782306428</td>\n",
       "      <td>1</td>\n",
       "      <td>2864801071</td>\n",
       "      <td>1000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>399254</td>\n",
       "      <td>821</td>\n",
       "      <td>0.0</td>\n",
       "      <td>760.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>559</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1555716</td>\n",
       "      <td>1.559837e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1392806005</td>\n",
       "      <td>2</td>\n",
       "      <td>628911675</td>\n",
       "      <td>696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>68983</td>\n",
       "      <td>1004</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2214.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>129</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1093419</td>\n",
       "      <td>1.560042e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>3562553457</td>\n",
       "      <td>3</td>\n",
       "      <td>1283809327</td>\n",
       "      <td>753</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>288999</td>\n",
       "      <td>1076</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2280.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>1</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>64</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1400089</td>\n",
       "      <td>1.559867e+12</td>\n",
       "      <td>5</td>\n",
       "      <td>2364522023</td>\n",
       "      <td>4</td>\n",
       "      <td>1510695983</td>\n",
       "      <td>582</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499995</th>\n",
       "      <td>499995</td>\n",
       "      <td>392477</td>\n",
       "      <td>1028</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>1920.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>1</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>144</td>\n",
       "      <td>...</td>\n",
       "      <td>Android</td>\n",
       "      <td>7.1.2</td>\n",
       "      <td>25</td>\n",
       "      <td>1546078</td>\n",
       "      <td>1.559834e+12</td>\n",
       "      <td>7</td>\n",
       "      <td>861755946</td>\n",
       "      <td>79</td>\n",
       "      <td>140647032</td>\n",
       "      <td>373</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499996</th>\n",
       "      <td>499996</td>\n",
       "      <td>346134</td>\n",
       "      <td>1001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1424.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>29</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1480612</td>\n",
       "      <td>1.559814e+12</td>\n",
       "      <td>3</td>\n",
       "      <td>1714444511</td>\n",
       "      <td>23</td>\n",
       "      <td>2745131047</td>\n",
       "      <td>525</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499997</th>\n",
       "      <td>499997</td>\n",
       "      <td>499635</td>\n",
       "      <td>761</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>1280.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>54</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>6.0.1</td>\n",
       "      <td>9</td>\n",
       "      <td>1698442</td>\n",
       "      <td>1.559676e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>3843262581</td>\n",
       "      <td>25</td>\n",
       "      <td>1326115882</td>\n",
       "      <td>810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499998</th>\n",
       "      <td>499998</td>\n",
       "      <td>239786</td>\n",
       "      <td>917</td>\n",
       "      <td>46001.0</td>\n",
       "      <td>960.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>540.0</td>\n",
       "      <td>0</td>\n",
       "      <td>zh_CN</td>\n",
       "      <td>109</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>5.1.1</td>\n",
       "      <td>0</td>\n",
       "      <td>1331155</td>\n",
       "      <td>1.559840e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1984296118</td>\n",
       "      <td>225</td>\n",
       "      <td>1446741112</td>\n",
       "      <td>772</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499999</th>\n",
       "      <td>499999</td>\n",
       "      <td>270531</td>\n",
       "      <td>929</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2040.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>1</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>59</td>\n",
       "      <td>...</td>\n",
       "      <td>Android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>78</td>\n",
       "      <td>1373973</td>\n",
       "      <td>1.559922e+12</td>\n",
       "      <td>5</td>\n",
       "      <td>1697301943</td>\n",
       "      <td>49</td>\n",
       "      <td>1915763579</td>\n",
       "      <td>1076</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>500000 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Unnamed: 0  android_id  apptype  carrier  dev_height  dev_ppi  \\\n",
       "0                0      316361     1199  46000.0         0.0      0.0   \n",
       "1                1      135939      893      0.0         0.0      0.0   \n",
       "2                2      399254      821      0.0       760.0      0.0   \n",
       "3                3       68983     1004  46000.0      2214.0      0.0   \n",
       "4                4      288999     1076  46000.0      2280.0      0.0   \n",
       "...            ...         ...      ...      ...         ...      ...   \n",
       "499995      499995      392477     1028  46000.0      1920.0      3.0   \n",
       "499996      499996      346134     1001      0.0      1424.0      0.0   \n",
       "499997      499997      499635      761  46000.0      1280.0      0.0   \n",
       "499998      499998      239786      917  46001.0       960.0      0.0   \n",
       "499999      499999      270531      929  46000.0      2040.0      3.0   \n",
       "\n",
       "        dev_width  label    lan  media_id  ...       os    osv package  \\\n",
       "0             0.0      1    NaN       104  ...  android      9      18   \n",
       "1             0.0      1    NaN        19  ...  android    8.1       0   \n",
       "2           360.0      1    NaN       559  ...  android  8.1.0       0   \n",
       "3          1080.0      0    NaN       129  ...  android  8.1.0       0   \n",
       "4          1080.0      1  zh-CN        64  ...  android  8.0.0       0   \n",
       "...           ...    ...    ...       ...  ...      ...    ...     ...   \n",
       "499995     1080.0      1  zh-CN       144  ...  Android  7.1.2      25   \n",
       "499996      720.0      0    NaN        29  ...  android  8.1.0       0   \n",
       "499997      720.0      0    NaN        54  ...  android  6.0.1       9   \n",
       "499998      540.0      0  zh_CN       109  ...  android  5.1.1       0   \n",
       "499999     1080.0      1  zh-CN        59  ...  Android  8.1.0      78   \n",
       "\n",
       "            sid     timestamp  version    fea_hash location   fea1_hash  \\\n",
       "0       1438873  1.559893e+12        8  2135019403        0  2329670524   \n",
       "1       1185582  1.559994e+12        4  2782306428        1  2864801071   \n",
       "2       1555716  1.559837e+12        0  1392806005        2   628911675   \n",
       "3       1093419  1.560042e+12        0  3562553457        3  1283809327   \n",
       "4       1400089  1.559867e+12        5  2364522023        4  1510695983   \n",
       "...         ...           ...      ...         ...      ...         ...   \n",
       "499995  1546078  1.559834e+12        7   861755946       79   140647032   \n",
       "499996  1480612  1.559814e+12        3  1714444511       23  2745131047   \n",
       "499997  1698442  1.559676e+12        0  3843262581       25  1326115882   \n",
       "499998  1331155  1.559840e+12        0  1984296118      225  1446741112   \n",
       "499999  1373973  1.559922e+12        5  1697301943       49  1915763579   \n",
       "\n",
       "        cus_type  \n",
       "0            601  \n",
       "1           1000  \n",
       "2            696  \n",
       "3            753  \n",
       "4            582  \n",
       "...          ...  \n",
       "499995       373  \n",
       "499996       525  \n",
       "499997       810  \n",
       "499998       772  \n",
       "499999      1076  \n",
       "\n",
       "[500000 rows x 21 columns]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import warnings\n",
    "import pickle\n",
    "\n",
    "# Load the training and test tables from the working directory.\n",
    "train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))\n",
    "# Show the training frame as the cell output.\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3de5943d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give the test frame a placeholder `label` column filled with zeros.\n",
    "test = test.assign(label=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "534069ac",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>android_id</th>\n",
       "      <th>apptype</th>\n",
       "      <th>carrier</th>\n",
       "      <th>dev_height</th>\n",
       "      <th>dev_ppi</th>\n",
       "      <th>dev_width</th>\n",
       "      <th>label</th>\n",
       "      <th>lan</th>\n",
       "      <th>media_id</th>\n",
       "      <th>...</th>\n",
       "      <th>os</th>\n",
       "      <th>osv</th>\n",
       "      <th>package</th>\n",
       "      <th>sid</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>version</th>\n",
       "      <th>fea_hash</th>\n",
       "      <th>location</th>\n",
       "      <th>fea1_hash</th>\n",
       "      <th>cus_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>316361</td>\n",
       "      <td>1199</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>104</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>9</td>\n",
       "      <td>18</td>\n",
       "      <td>1438873</td>\n",
       "      <td>1.559893e+12</td>\n",
       "      <td>8</td>\n",
       "      <td>2135019403</td>\n",
       "      <td>0</td>\n",
       "      <td>2329670524</td>\n",
       "      <td>601</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>135939</td>\n",
       "      <td>893</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1</td>\n",
       "      <td>0</td>\n",
       "      <td>1185582</td>\n",
       "      <td>1.559994e+12</td>\n",
       "      <td>4</td>\n",
       "      <td>2782306428</td>\n",
       "      <td>1</td>\n",
       "      <td>2864801071</td>\n",
       "      <td>1000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>399254</td>\n",
       "      <td>821</td>\n",
       "      <td>0.0</td>\n",
       "      <td>760.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>559</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1555716</td>\n",
       "      <td>1.559837e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1392806005</td>\n",
       "      <td>2</td>\n",
       "      <td>628911675</td>\n",
       "      <td>696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>68983</td>\n",
       "      <td>1004</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2214.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>129</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1093419</td>\n",
       "      <td>1.560042e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>3562553457</td>\n",
       "      <td>3</td>\n",
       "      <td>1283809327</td>\n",
       "      <td>753</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>288999</td>\n",
       "      <td>1076</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>2280.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>1</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>64</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>8.0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1400089</td>\n",
       "      <td>1.559867e+12</td>\n",
       "      <td>5</td>\n",
       "      <td>2364522023</td>\n",
       "      <td>4</td>\n",
       "      <td>1510695983</td>\n",
       "      <td>582</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149995</th>\n",
       "      <td>149995</td>\n",
       "      <td>0</td>\n",
       "      <td>1001</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>760.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>0</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>29</td>\n",
       "      <td>...</td>\n",
       "      <td>Android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>4</td>\n",
       "      <td>1165373</td>\n",
       "      <td>1.559957e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>3162887451</td>\n",
       "      <td>126</td>\n",
       "      <td>2711576615</td>\n",
       "      <td>411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149996</th>\n",
       "      <td>149996</td>\n",
       "      <td>0</td>\n",
       "      <td>1001</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>780.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>0</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>29</td>\n",
       "      <td>...</td>\n",
       "      <td>Android</td>\n",
       "      <td>9.0.0</td>\n",
       "      <td>4</td>\n",
       "      <td>1444115</td>\n",
       "      <td>1.559863e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>97238959</td>\n",
       "      <td>322</td>\n",
       "      <td>2678022183</td>\n",
       "      <td>411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149997</th>\n",
       "      <td>149997</td>\n",
       "      <td>0</td>\n",
       "      <td>1001</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>780.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>0</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>29</td>\n",
       "      <td>...</td>\n",
       "      <td>Android</td>\n",
       "      <td>8.1.0</td>\n",
       "      <td>4</td>\n",
       "      <td>1134378</td>\n",
       "      <td>1.560041e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>1320118495</td>\n",
       "      <td>46</td>\n",
       "      <td>2610913319</td>\n",
       "      <td>411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149998</th>\n",
       "      <td>149998</td>\n",
       "      <td>500925</td>\n",
       "      <td>1052</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>854.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>480.0</td>\n",
       "      <td>0</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>249</td>\n",
       "      <td>...</td>\n",
       "      <td>android</td>\n",
       "      <td>4.4.2</td>\n",
       "      <td>0</td>\n",
       "      <td>1700238</td>\n",
       "      <td>1.559688e+12</td>\n",
       "      <td>2</td>\n",
       "      <td>1292986591</td>\n",
       "      <td>41</td>\n",
       "      <td>1898209327</td>\n",
       "      <td>430</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149999</th>\n",
       "      <td>149999</td>\n",
       "      <td>0</td>\n",
       "      <td>1001</td>\n",
       "      <td>46000.0</td>\n",
       "      <td>780.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>0</td>\n",
       "      <td>zh-CN</td>\n",
       "      <td>29</td>\n",
       "      <td>...</td>\n",
       "      <td>Android</td>\n",
       "      <td>9.0.0</td>\n",
       "      <td>4</td>\n",
       "      <td>1201539</td>\n",
       "      <td>1.559950e+12</td>\n",
       "      <td>0</td>\n",
       "      <td>259614175</td>\n",
       "      <td>122</td>\n",
       "      <td>2594136103</td>\n",
       "      <td>411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>650000 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Unnamed: 0  android_id  apptype  carrier  dev_height  dev_ppi  \\\n",
       "0                0      316361     1199  46000.0         0.0      0.0   \n",
       "1                1      135939      893      0.0         0.0      0.0   \n",
       "2                2      399254      821      0.0       760.0      0.0   \n",
       "3                3       68983     1004  46000.0      2214.0      0.0   \n",
       "4                4      288999     1076  46000.0      2280.0      0.0   \n",
       "...            ...         ...      ...      ...         ...      ...   \n",
       "149995      149995           0     1001  46000.0       760.0      0.0   \n",
       "149996      149996           0     1001  46000.0       780.0      0.0   \n",
       "149997      149997           0     1001  46000.0       780.0      0.0   \n",
       "149998      149998      500925     1052  46000.0       854.0    240.0   \n",
       "149999      149999           0     1001  46000.0       780.0      0.0   \n",
       "\n",
       "        dev_width  label    lan  media_id  ...       os    osv package  \\\n",
       "0             0.0      1    NaN       104  ...  android      9      18   \n",
       "1             0.0      1    NaN        19  ...  android    8.1       0   \n",
       "2           360.0      1    NaN       559  ...  android  8.1.0       0   \n",
       "3          1080.0      0    NaN       129  ...  android  8.1.0       0   \n",
       "4          1080.0      1  zh-CN        64  ...  android  8.0.0       0   \n",
       "...           ...    ...    ...       ...  ...      ...    ...     ...   \n",
       "149995      360.0      0  zh-CN        29  ...  Android  8.1.0       4   \n",
       "149996      360.0      0  zh-CN        29  ...  Android  9.0.0       4   \n",
       "149997      360.0      0  zh-CN        29  ...  Android  8.1.0       4   \n",
       "149998      480.0      0  zh-CN       249  ...  android  4.4.2       0   \n",
       "149999      360.0      0  zh-CN        29  ...  Android  9.0.0       4   \n",
       "\n",
       "            sid     timestamp  version    fea_hash location   fea1_hash  \\\n",
       "0       1438873  1.559893e+12        8  2135019403        0  2329670524   \n",
       "1       1185582  1.559994e+12        4  2782306428        1  2864801071   \n",
       "2       1555716  1.559837e+12        0  1392806005        2   628911675   \n",
       "3       1093419  1.560042e+12        0  3562553457        3  1283809327   \n",
       "4       1400089  1.559867e+12        5  2364522023        4  1510695983   \n",
       "...         ...           ...      ...         ...      ...         ...   \n",
       "149995  1165373  1.559957e+12        0  3162887451      126  2711576615   \n",
       "149996  1444115  1.559863e+12        0    97238959      322  2678022183   \n",
       "149997  1134378  1.560041e+12        0  1320118495       46  2610913319   \n",
       "149998  1700238  1.559688e+12        2  1292986591       41  1898209327   \n",
       "149999  1201539  1.559950e+12        0   259614175      122  2594136103   \n",
       "\n",
       "        cus_type  \n",
       "0            601  \n",
       "1           1000  \n",
       "2            696  \n",
       "3            753  \n",
       "4            582  \n",
       "...          ...  \n",
       "149995       411  \n",
       "149996       411  \n",
       "149997       411  \n",
       "149998       430  \n",
       "149999       411  \n",
       "\n",
       "[650000 rows x 21 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack train and test into one frame. ignore_index=True gives every row a\n",
    "# unique 0..n-1 label instead of repeating each source frame's own index.\n",
    "full_data_df = pd.concat([train, test], ignore_index=True)\n",
    "full_data_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9be06c65",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unnamed: 0存在着500000个不同的值\n",
      "android_id存在着362258个不同的值\n",
      "apptype存在着89个不同的值\n",
      "carrier存在着5个不同的值\n",
      "dev_height存在着798个不同的值\n",
      "dev_ppi存在着92个不同的值\n",
      "dev_width存在着346个不同的值\n",
      "label存在着2个不同的值\n",
      "lan存在着22个不同的值\n",
      "media_id存在着284个不同的值\n",
      "ntt存在着8个不同的值\n",
      "os存在着2个不同的值\n",
      "osv存在着155个不同的值\n",
      "package存在着1950个不同的值\n",
      "sid存在着500000个不同的值\n",
      "timestamp存在着500000个不同的值\n",
      "version存在着22个不同的值\n",
      "fea_hash存在着402980个不同的值\n",
      "location存在着332个不同的值\n",
      "fea1_hash存在着4959个不同的值\n",
      "cus_type存在着58个不同的值\n"
     ]
    }
   ],
   "source": [
    "# Report how many distinct values each training column holds. Both the column\n",
    "# names and the values come from `train`, and NaN is counted as one value.\n",
    "for col_name in train.columns:\n",
    "    distinct_count = train[col_name].nunique(dropna=False)\n",
    "    print('{}存在着{}个不同的值'.format(col_name, distinct_count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "93900b60",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Android', 'android'}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distinct raw values of the `os` column in the training data.\n",
    "set(train['os'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "251e6dbc",
   "metadata": {},
   "source": [
    "- android_id、sid、timestamp、fea_hash 不需要做Embedding\n",
    "- os 只有 'Android' 和 'android' 两种写法（仅大小写不同），实质上是同一个值，不需要加入计算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "0e2284a3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "apptype 字典生成完毕,共 89 个id\n",
      "carrier 字典生成完毕,共 5 个id\n",
      "dev_height 字典生成完毕,共 864 个id\n",
      "dev_ppi 字典生成完毕,共 105 个id\n",
      "dev_width 字典生成完毕,共 382 个id\n",
      "lan 字典生成完毕,共 25 个id\n",
      "media_id 字典生成完毕,共 292 个id\n",
      "ntt 字典生成完毕,共 8 个id\n",
      "osv 字典生成完毕,共 165 个id\n",
      "package 字典生成完毕,共 2102 个id\n",
      "version 字典生成完毕,共 23 个id\n",
      "location 字典生成完毕,共 332 个id\n",
      "fea1_hash 字典生成完毕,共 6147 个id\n",
      "cus_type 字典生成完毕,共 58 个id\n"
     ]
    }
   ],
   "source": [
    "embed_list = ['apptype','carrier','dev_height','dev_ppi','dev_width','lan','media_id','ntt','osv','package','version','location','fea1_hash','cus_type']\n",
    "\n",
    "\n",
    "def save_obj(obj, name):\n",
    "    with open(name + '.pkl', 'wb') as f:\n",
    "        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)\n",
    " \n",
    " \n",
    "def load_obj(name):\n",
    "    with open(name + '.pkl', 'rb') as f:\n",
    "        return pickle.load(f)\n",
    "\n",
    "\n",
    "\n",
    "def gen_embed_vocab(data, col, vocab_path):\n",
    "    data_hash = dict()\n",
    "    # 制作字典\n",
    "    for sample in data:\n",
    "        sample = str(sample)\n",
    "        if sample not in data_hash:\n",
    "            data_hash[sample] = len(data_hash) + 1\n",
    "    # 保存字典\n",
    "#     save_obj(data_hash, vocab_path+col+'_vocab')\n",
    "    print(col, \"字典生成完毕,共\", len(data_hash), \"个id\")\n",
    "    return data_hash\n",
    "\n",
    "\n",
    "# Keep every column's vocabulary, keyed by column name, instead of discarding\n",
    "# all but the last one. `a` is still bound to the most recent vocabulary\n",
    "# because a later cell displays it.\n",
    "embed_vocabs = {}\n",
    "for col in embed_list:\n",
    "    a = embed_vocabs[col] = gen_embed_vocab(full_data_df[col], col, 'vocab/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0db75885",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'601': 1,\n",
       " '1000': 2,\n",
       " '696': 3,\n",
       " '753': 4,\n",
       " '582': 5,\n",
       " '430': 6,\n",
       " '411': 7,\n",
       " '886': 8,\n",
       " '335': 9,\n",
       " '848': 10,\n",
       " '449': 11,\n",
       " '316': 12,\n",
       " '392': 13,\n",
       " '297': 14,\n",
       " '1171': 15,\n",
       " '658': 16,\n",
       " '1361': 17,\n",
       " '1285': 18,\n",
       " '772': 19,\n",
       " '1152': 20,\n",
       " '468': 21,\n",
       " '1057': 22,\n",
       " '506': 23,\n",
       " '620': 24,\n",
       " '829': 25,\n",
       " '1380': 26,\n",
       " '544': 27,\n",
       " '1228': 28,\n",
       " '563': 29,\n",
       " '487': 30,\n",
       " '791': 31,\n",
       " '1190': 32,\n",
       " '525': 33,\n",
       " '924': 34,\n",
       " '1323': 35,\n",
       " '1304': 36,\n",
       " '943': 37,\n",
       " '1133': 38,\n",
       " '373': 39,\n",
       " '715': 40,\n",
       " '1076': 41,\n",
       " '1114': 42,\n",
       " '1247': 43,\n",
       " '639': 44,\n",
       " '734': 45,\n",
       " '1209': 46,\n",
       " '1095': 47,\n",
       " '981': 48,\n",
       " '1266': 49,\n",
       " '1342': 50,\n",
       " '1019': 51,\n",
       " '1038': 52,\n",
       " '962': 53,\n",
       " '810': 54,\n",
       " '867': 55,\n",
       " '677': 56,\n",
       " '354': 57,\n",
       " '905': 58}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the vocabulary left in `a` by the loop above (built last, for 'cus_type').\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d5bce8e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
